Source Code of org.terrier.utility.io.HadoopUtility$MapReduceBase

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is HadoopUtility.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   
 */
package org.terrier.utility.io;


import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.util.HashSet;
import java.util.Properties;
import java.util.Random;
import java.util.Set;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.log4j.Logger;
import org.terrier.structures.Index;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;


/** Utility class for the setting up and configuring of Terrier MapReduce jobs.
 * General scheme for a Hadoop Job
 * <code>
 * JobFactory jf = HadoopUtility.getJobFactory("TerrierJob");
 * JobConf jc = jf.newJob();
 * HadoopUtility.makeTerrierJob(jc);
 * &47;&47; populate jc
 * &47;&47; if an index is needed in the MR job:
 * HadoopUtility.toHConfiguration(index, jc);
 * Running rj = JobClient.runJob(jc);
 * HadoopUtility.finishTerrierJob(jc);
 * </code>
 * During a MR job, the configure method should call HadoopUtility.loadTerrierJob(jc);
 * To obtain an index, Index index = HadoopUtility.fromHConfiguration(jc);
 * @author Craig Macdonald
 * @since 2.2. 
 */
@SuppressWarnings("deprecation")
public class HadoopUtility {
  
  protected static final Logger logger = Logger.getLogger(HadoopUtility.class);
  
  /** Handy base class for MapReduce jobs. */
  public static abstract class MapReduceBase<K1,V1,K2,V2,K3,V3> implements Mapper<K1,V1,K2,V2>, Reducer<K2,V2,K3,V3>
  {
    protected JobConf jc;
    
    /** {@inheritDoc} */
    public void configure(JobConf _jc) {
      this.jc = _jc;
            
      //1. configure application
      try{ 
        HadoopUtility.loadTerrierJob(_jc);
      } catch (Exception e) {
        throw new Error("Cannot load ApplicationSetup", e);
      }
      
      //2. configurure this class
      try{
        if (isMap(_jc))
        {
          configureMap();
        } else {
          configureReduce();
        }
      } catch (Exception e) { 
        throw new Error("Cannot configure indexer", e);
      }
    }
    
    protected abstract void configureMap() throws IOException;
    protected abstract void configureReduce() throws IOException;
    
    /** Called at end of map or reduce task. Calls internally closeMap() or closeReduce() */
    public void close() throws IOException {
      if (isMap(jc))
      {
        closeMap();
      } else {
        closeReduce();
      }
    }


    protected abstract void closeMap() throws IOException;
    protected abstract void closeReduce() throws IOException;    
  }
  
  /** Utility method to detect if a task is a Map task or not */
  public static final boolean isMap(JobConf jc) {
    return TaskAttemptID.forName(jc.get("mapred.task.id")).isMap();
  }
  
  /** Utility method to set MapOutputCompression if possible.
   * In general, I find that MapOutputCompression fails for
   * local job trackers, so this code checks the job tracker
   * location first.
   * @param conf JobConf of job.
   * @return true if MapOutputCompression was set.
   */
  public static boolean setMapOutputCompression(JobConf conf)
  {
    if (! conf.get("mapred.job.tracker").equals("local"))
    {
      conf.setMapOutputCompressorClass(GzipCodec.class);
      conf.setCompressMapOutput(true);
      return true;
    }
    return false;
  }
  
  /** Utility method to set JobOutputCompression if possible.
   * In general, I find that JobOutputCompression fails for
   * local job trackers, so this code checks the job tracker
   * location first.
   * @param conf JobConf of job.
   * @return true if JobOutputCompression was set.
   */
  public static boolean setJobOutputCompression(JobConf conf)
  {
    if (! conf.get("mapred.job.tracker").equals("local"))
    {
      FileOutputFormat.setCompressOutput(conf, true);
      FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
      return true;
    }
    return false;
  }


  /** Saves the current ApplicationSetup to the specified JobConf.
   * After the JobConf job has run, use finishTerrierJob() to delete any
   * leftover files */
  public static void makeTerrierJob(JobConf jobConf) throws IOException
  {
    if (jobConf.get("mapred.job.tracker").equals("local"))
      return;
    try{
      saveApplicationSetupToJob(jobConf, true);
      saveClassPathToJob(jobConf);
     } catch (Exception e) {
      throw new WrappedIOException("Cannot HadoopUtility.makeTerrierJob", e);
    }
  }
  
  /** When the current ApplicationSetup has been saved to the JobConf, by makeTerrierJob(),
   * use this method during the MR job to properly initialise Terrier.
   */
  public static void loadTerrierJob(JobConf jobConf) throws IOException
  {
    if (jobConf.get("mapred.job.tracker").equals("local"))
      return;
    try{
      HadoopPlugin.setGlobalConfiguration(jobConf);
      loadApplicationSetup(jobConf);
    } catch (Exception e) {
       throw new WrappedIOException("Cannot HadoopUtility.loadTerrierJob", e);
    }
  }
  
  /** Call this after the MapReduce job specified by jobConf has completed,
   * to clean up any leftover files */
  public static void finishTerrierJob(JobConf jobConf) throws IOException
  {
    if (jobConf.get("mapred.job.tracker").equals("local"))
      return;
    deleteJobApplicationSetup(jobConf);
    removeClassPathFromJob(jobConf);
  }
  
  protected static void removeClassPathFromJob(JobConf jobConf) throws IOException
  {
    final String[] jars = findJarFiles(new String[]{
        System.getenv().get("CLASSPATH"),
        System.getProperty("java.class.path")
      });
    final FileSystem defFS = FileSystem.get(jobConf);
    for (String jarFile : jars)
    {
      Path srcJarFilePath = new Path("file:///"+jarFile);
      String filename = srcJarFilePath.getName();
      //for a given job, makeTemporaryFile will return the same temporary id
      Path tmpJarFilePath = makeTemporaryFile(jobConf, filename);
      defFS.delete(tmpJarFilePath, false);
    }
  }


  protected static void saveClassPathToJob(JobConf jobConf) throws IOException
  {
    //logger.info("Copying classpath to job");
    if (jobConf.getBoolean("terrier.classpath.copied", false))
    {
      return;
    }
    jobConf.setBoolean("terrier.classpath.copied", true);
    final String[] jars = findJarFiles(new String[]{
        System.getenv().get("CLASSPATH"),
        System.getProperty("java.class.path")
      });
    final FileSystem defFS = FileSystem.get(jobConf);
    for (String jarFile : jars)
    {
      //logger.debug("Adding " + jarFile + " to job class path");
      Path srcJarFilePath = new Path("file:///"+jarFile);
      String filename = srcJarFilePath.getName();
      Path tmpJarFilePath = makeTemporaryFile(jobConf, filename);      
      defFS.copyFromLocalFile(srcJarFilePath, tmpJarFilePath);
      DistributedCache.addFileToClassPath(tmpJarFilePath, jobConf);
    }
     DistributedCache.createSymlink(jobConf);
  }


  protected static String[] findJarFiles(String [] classPathLines)
  {
    Set<String> jars = new HashSet<String>();
    for (String locationsLine : classPathLines)
    {
      if (locationsLine == null)
        continue;
      for (String CPentry : locationsLine.split(":"))
      {
        if (CPentry.endsWith(".jar"))
          jars.add(new File(CPentry).getAbsoluteFile().toString());
      }
    }
    return jars.toArray(new String[0]);
  }


  protected static final String[] checkSystemProperties = {"file", "java", "line", "os", "path", "sun", "user"};
  protected static final Random random = new Random();


  protected static Path makeTemporaryFile(JobConf jobConf, String filename) throws IOException
  {
    final int randomKey = jobConf.getInt("terrier.tempfile.id", random.nextInt());
    jobConf.setInt("terrier.tempfile.id", randomKey);
    FileSystem defFS = FileSystem.get(jobConf);
        final Path tempFile = new Path("/tmp/"+(randomKey)+"-"+filename);
        defFS.deleteOnExit(tempFile);
    return tempFile;
  }
  
  protected static void deleteJobApplicationSetup(JobConf jobConf) throws IOException
  {
    FileSystem remoteFS = FileSystem.get(jobConf);
    String copiedTerrierShare = jobConf.get("terrier.share.copied", null);
    if (copiedTerrierShare != null)
    {
      logger.debug("Removing temporary terrier.share at " + copiedTerrierShare);
      Files.delete(copiedTerrierShare);
    }
    for(String filename : new String[]{"terrier.properties", "system.properties"})
    {  
      Path p = findCacheFileByFragment(jobConf, filename);
      remoteFS.delete(p, false);
    }
  }
  
  protected static void saveApplicationSetupToJob(JobConf jobConf, boolean getFreshProperties) throws Exception
  {
    // Do we load a fresh properties File?
    //TODO fix, if necessary
    //if (getFreshProperties)
    //  loadApplicationSetup(new Path(ApplicationSetup.TERRIER_HOME));
    
    FileSystem remoteFS = FileSystem.get(jobConf);
    URI remoteFSURI = remoteFS.getUri();
    //make a copy of the current application setup properties, these may be amended
    //as some files are more globally accessible
    final Properties propertiesDuringJob = new Properties();
    Properties appProperties = ApplicationSetup.getProperties();
    for (Object _key: appProperties.keySet())
    {
      String key = (String)_key;
      propertiesDuringJob.put(key, appProperties.get(key));
    }


    //the share folder is needed during indexing, save this on DFS
    if (Files.getFileSystemName(ApplicationSetup.TERRIER_SHARE).equals("local"))
    {
      Path tempTRShare = makeTemporaryFile(jobConf, "terrier.share");
      propertiesDuringJob.setProperty("terrier.share", remoteFSURI.resolve(tempTRShare.toUri()).toString());
      if (Files.exists(ApplicationSetup.TERRIER_SHARE))
      {
        jobConf.set("terrier.share.copied", remoteFSURI.resolve(tempTRShare.toUri()).toString());
        //logger.info("Copying terrier share/ directory ("+ApplicationSetup.TERRIER_SHARE+") to shared storage area ("+remoteFSURI.resolve(tempTRShare.toUri()).toString()+")");
        FileUtil.copy(
            FileSystem.getLocal(jobConf), new Path(ApplicationSetup.TERRIER_SHARE),
            remoteFS, tempTRShare,
            false, false, jobConf);
      }
      else
      {
        //logger.warn("No terrier.share folder found at "+ApplicationSetup.TERRIER_SHARE+", copying skipped");
      }
    }


    //copy the terrier.properties content over
    Path tempTRProperties = makeTemporaryFile(jobConf, "terrier.properties");
    logger.debug("Writing terrier properties out to DFS "+tempTRProperties.toString());
    OutputStream out = remoteFS.create(tempTRProperties);
    remoteFS.deleteOnExit(tempTRProperties);
    propertiesDuringJob.store(out, "Automatically generated by HadoopUtility.saveApplicationSetupToJob()");
    out.close();
    out = null;
    DistributedCache.addCacheFile(tempTRProperties.toUri().resolve(new URI("#terrier.properties")), jobConf);
    DistributedCache.createSymlink(jobConf);
  
    //copy the non-JVM system properties over as well
    Path tempSysProperties = makeTemporaryFile(jobConf, "system.properties");  
    DataOutputStream dos = FileSystem.get(jobConf).create(tempSysProperties);
    logger.debug("Writing system properties out to DFS "+tempSysProperties.toString());
    for (Object _propertyKey : System.getProperties().keySet())
    {
      String propertyKey = (String)_propertyKey;
      if (! startsWithAny(propertyKey, checkSystemProperties))
      {
        dos.writeUTF(propertyKey);
        dos.writeUTF(System.getProperty(propertyKey));
      }
    }
    dos.writeUTF("FIN");
    dos.close();
    dos = null;
    DistributedCache.addCacheFile(tempSysProperties.toUri().resolve(new URI("#system.properties")), jobConf);
  }


  protected static Path findCacheFileByFragment(JobConf jc, String name) throws IOException
  {
    URI[] ps = DistributedCache.getCacheFiles(jc);
    URI defaultFS = FileSystem.getDefaultUri(jc);
    if (ps == null)
      return null;
    for (URI _p : ps)
    {
      final URI p = defaultFS.resolve(_p);
      if (p.getFragment().equals(name))
      {
        logger.debug("Found matching path in DistributedCache in search for "+name+" : " +new Path(p.getScheme(), p.getAuthority(), p.getPath()).toString());
        return new Path(p.getScheme(), p.getAuthority(), p.getPath());
      }
    }
    return null;
  }
  
  protected static void loadApplicationSetup(JobConf jobConf) throws IOException
  {
    //logger.info("Reloading Application Setup");
    //we dont use Terrier's IO layer here, because it is not yet initialised
    FileSystem sharedFS = FileSystem.get(jobConf);
    Path terrierPropertiesFile =  findCacheFileByFragment(jobConf, "terrier.properties");  
    Path systemPropertiesFile = findCacheFileByFragment(jobConf, "system.properties");
  
    if (systemPropertiesFile != null && sharedFS.exists(systemPropertiesFile))
    {
      DataInputStream dis = sharedFS.open(systemPropertiesFile);
      while(true)
      {
        String key = dis.readUTF();
        if (key.equals("FIN"))
          break;
        String value = dis.readUTF();
        System.setProperty(key, value);
      }
      dis.close();
    }
    else
    {
      //logger.warn("No system.properties file found at "+systemPropertiesFile);
    }


    if (terrierPropertiesFile != null && sharedFS.exists(terrierPropertiesFile))
    {
      ApplicationSetup.configure(sharedFS.open(terrierPropertiesFile));
    }
    else
    {
      throw new java.io.FileNotFoundException("No terrier.properties file found at "+terrierPropertiesFile);
    }
  }
  
  /** Get an Index saved to the specifified Hadoop configuration by toHConfiguration() */
  public static Index fromHConfiguration(Configuration c)
  {
    return Index.createIndex(c.get("terrier.index.path"), c.get("terrier.index.prefix"));
  }
  
  /** Puts the specified index onto the given Hadoop configuration */
  public static void toHConfiguration(Index i, Configuration c)
  {
    c.set("terrier.index.path", i.getPath());
    c.set("terrier.index.prefix", i.getPrefix());
  }
  
  /**
   * Returns true if source contains any of the Strings held in checks. Case insensitive.
   * @param source String to check
   * @param checks Strings to check for
   * @return true if source starts with one of checks, false otherwise.
   */
  protected static boolean startsWithAny(String source, String[] checks) {
    for (String s:checks) {
      if (source.toLowerCase().startsWith(s.toLowerCase())) return true;
    }
    return false;
  }
}
Source Code of org.terrier.utility.io.HadoopUtility$MapReduceBase

Related Classes of org.terrier.utility.io.HadoopUtility$MapReduceBase